# Import train.csv, test.csv and store.csv
train <- read.csv("train.csv", stringsAsFactors = F)
test <- read.csv("test.csv", stringsAsFactors = F)
store <- read.csv("store.csv", stringsAsFactors = F)
| No. of Rows | No. of Columns | |
|---|---|---|
| Train | 1017209 | 9 |
| Test | 41088 | 8 |
| Store | 1115 | 10 |
#a. Train
train <- train %>% mutate(
DayOfWeek = as.factor(DayOfWeek),
Date = as.Date(Date),
Open = as.factor(Open),
Promo = as.factor(Promo),
StateHoliday = as.factor(StateHoliday), # Has 4 values!
SchoolHoliday = as.factor(SchoolHoliday))
str(train)
## 'data.frame': 1017209 obs. of 9 variables:
## $ Store : int 1 2 3 4 5 6 7 8 9 10 ...
## $ DayOfWeek : Factor w/ 7 levels "1","2","3","4",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ Date : Date, format: "2015-07-31" "2015-07-31" ...
## $ Sales : int 5263 6064 8314 13995 4822 5651 15344 8492 8565 7185 ...
## $ Customers : int 555 625 821 1498 559 589 1414 833 687 681 ...
## $ Open : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ Promo : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ StateHoliday : Factor w/ 4 levels "0","a","b","c": 1 1 1 1 1 1 1 1 1 1 ...
## $ SchoolHoliday: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
#b. Test
test <- test %>% mutate(
DayOfWeek = as.factor(DayOfWeek),
Date = as.Date(Date),
Open = as.factor(Open),
Promo = as.factor(Promo),
StateHoliday = as.factor(StateHoliday), # Only 2 values! What're the state holidays?
SchoolHoliday = as.factor(SchoolHoliday))
str(test)
## 'data.frame': 41088 obs. of 8 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Store : int 1 3 7 8 9 10 11 12 13 14 ...
## $ DayOfWeek : Factor w/ 7 levels "1","2","3","4",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ Date : Date, format: "2015-09-17" "2015-09-17" ...
## $ Open : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ Promo : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ StateHoliday : Factor w/ 2 levels "0","a": 1 1 1 1 1 1 1 1 1 1 ...
## $ SchoolHoliday: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
#c. Store
store <- store %>% mutate(
StoreType = as.factor(StoreType),
Assortment = as.factor(Assortment),
Promo2 = as.factor(Promo2),
PromoInterval = as.factor(PromoInterval),
CompetitionOpenSinceMonth = as.numeric(CompetitionOpenSinceMonth),
CompetitionOpenSinceYear = as.numeric(CompetitionOpenSinceYear))
str(store)
## 'data.frame': 1115 obs. of 10 variables:
## $ Store : int 1 2 3 4 5 6 7 8 9 10 ...
## $ StoreType : Factor w/ 4 levels "a","b","c","d": 3 1 1 3 1 1 1 1 1 1 ...
## $ Assortment : Factor w/ 3 levels "a","b","c": 1 1 1 3 1 1 3 1 3 1 ...
## $ CompetitionDistance : int 1270 570 14130 620 29910 310 24000 7520 2030 3160 ...
## $ CompetitionOpenSinceMonth: num 9 11 12 9 4 12 4 10 8 9 ...
## $ CompetitionOpenSinceYear : num 2008 2007 2006 2009 2015 ...
## $ Promo2 : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 1 1 1 ...
## $ Promo2SinceWeek : int NA 13 14 NA NA NA NA NA NA NA ...
## $ Promo2SinceYear : int NA 2010 2011 NA NA NA NA NA NA NA ...
## $ PromoInterval : Factor w/ 4 levels "","Feb,May,Aug,Nov",..: 1 3 3 1 1 1 1 1 1 1 ...
| No. of NAs | |
|---|---|
| Store | 0 |
| DayOfWeek | 0 |
| Date | 0 |
| Sales | 0 |
| Customers | 0 |
| Open | 0 |
| Promo | 0 |
| StateHoliday | 0 |
| SchoolHoliday | 0 |
| No. of NAs | |
|---|---|
| Id | 0 |
| Store | 0 |
| DayOfWeek | 0 |
| Date | 0 |
| Open | 11 |
| Promo | 0 |
| StateHoliday | 0 |
| SchoolHoliday | 0 |
Observations:
Hence, we will impute 1 into the NA values for the ‘Open’ variable in the test dataset.
# a. Retrieve records with Open = NA
test %>% filter(is.na(Open)) %>% html_df()
| Id | Store | DayOfWeek | Date | Open | Promo | StateHoliday | SchoolHoliday |
|---|---|---|---|---|---|---|---|
| 480 | 622 | 4 | 2015-09-17 | NA | 1 | 0 | 0 |
| 1336 | 622 | 3 | 2015-09-16 | NA | 1 | 0 | 0 |
| 2192 | 622 | 2 | 2015-09-15 | NA | 1 | 0 | 0 |
| 3048 | 622 | 1 | 2015-09-14 | NA | 1 | 0 | 0 |
| 4760 | 622 | 6 | 2015-09-12 | NA | 0 | 0 | 0 |
| 5616 | 622 | 5 | 2015-09-11 | NA | 0 | 0 | 0 |
| 6472 | 622 | 4 | 2015-09-10 | NA | 0 | 0 | 0 |
| 7328 | 622 | 3 | 2015-09-09 | NA | 0 | 0 | 0 |
| 8184 | 622 | 2 | 2015-09-08 | NA | 0 | 0 | 0 |
| 9040 | 622 | 1 | 2015-09-07 | NA | 0 | 0 | 0 |
| 10752 | 622 | 6 | 2015-09-05 | NA | 0 | 0 | 0 |
# b. Impute NA with Open = 1
test <- test %>% mutate(Open = replace(Open, is.na(Open),1))
# c. Check if NA has been replaced:
sum(is.na(test$Open))
## [1] 0
| No. of NAs | |
|---|---|
| Store | 0 |
| StoreType | 0 |
| Assortment | 0 |
| CompetitionDistance | 3 |
| CompetitionOpenSinceMonth | 354 |
| CompetitionOpenSinceYear | 354 |
| Promo2 | 0 |
| Promo2SinceWeek | 544 |
| Promo2SinceYear | 544 |
| PromoInterval | 0 |
With mean
# Impute NA with mean of CompetitionDistance
store <- store %>%
mutate(CompetitionDistance= replace(CompetitionDistance, is.na(CompetitionDistance), mean(CompetitionDistance,na.rm=T)))
# Check for NAs
store %>% filter(is.na(CompetitionDistance)) %>% nrow()
## [1] 0
# Impute NA with median
store <- store %>%
mutate(CompetitionOpenSinceMonth=ifelse(is.na(CompetitionOpenSinceMonth),median(CompetitionOpenSinceMonth,na.rm=T), CompetitionOpenSinceMonth),
CompetitionOpenSinceYear=ifelse(is.na(CompetitionOpenSinceYear),median(CompetitionOpenSinceYear,na.rm=T), CompetitionOpenSinceYear),
Promo2SinceYear=ifelse(is.na(Promo2SinceYear),0, Promo2SinceYear),
Promo2SinceWeek=ifelse(is.na(Promo2SinceWeek),0, Promo2SinceWeek))
# Check no. of NAs for CompetitionOpenSinceMonth/Year
store %>% is.na() %>% colSums() %>% data.frame() %>% `colnames<-`("No. of NAs") %>% html_df
| No. of NAs | |
|---|---|
| Store | 0 |
| StoreType | 0 |
| Assortment | 0 |
| CompetitionDistance | 0 |
| CompetitionOpenSinceMonth | 0 |
| CompetitionOpenSinceYear | 0 |
| Promo2 | 0 |
| Promo2SinceWeek | 0 |
| Promo2SinceYear | 0 |
| PromoInterval | 0 |
plot(train$Date, type = "l")
plot(test$Date, type = "l")
No visible breaks in data, hence no missing data by date.
## Find missing data ##
# Expected rows of records (1115 x 941 days) = 1,049,215 vs Actual = 1,017,209. Missing records = 33,121
# 1. Finding all combinations of stores and dates
allStoresAndDates <- expand.grid(unique(train.store$Store), unique(train.store$Date))
# Explanation
# - List all permutations of stores (1,115) and dates (971 days)
# 2. Naming the two columns in the newly created dataframe for step 3
names(allStoresAndDates) <- c("Store", "Date")
# 3. Extract stores with missing dates and consequently sales data.
missingDatesForStores <- anti_join(allStoresAndDates, train.store, by = c("Store", "Date"))
# Explanation
# - anti_join is a dplyr function that finds unmatched records.
# - 1st parameter = "Master table"
# - 2nd parameter = Comparison table
# - Function checks "train.store" records against "allStoresAndDates" and for
# records that train.store do not have, show it as an output.
# Actual missing records do not equate to expected missing records as some stores may only be opened after the start date.
## Note: Ignoring missing data ##
# As per competition host, Florian, "The missing data you’re observing for a 6 month period in 2014 was a mistake done by us. For some stores this data was simply not included in the train-set. We’ve discussed this with Kaggle and decided that it’s an insignificant omission as there are still more than enough store/date combinations left to create a model on.""
## Conclusion: Proceed to find other missing data.
train.store <- merge(train, store, by = "Store")
test.store <- merge(test, store, by = "Store")
Sunday has the least sales for all opened stores over the data period, and that could be because most stores are closed on Sundays.
# Check if closed stores have any sales. Result = no anomalies.
train.closed <- train[train$Open == 0,]
train.closed$Sales %>% sum()
## [1] 0
# First plot
ggplot(data = train, aes (x= DayOfWeek, y= Sales)) +
geom_bar(stat = "identity")
# Second plot
train %>% group_by(DayOfWeek, Open) %>% tally() %>%
ggplot(aes(x=DayOfWeek, y=n, fill = Open)) +
geom_bar(stat="identity")
# PromotionInterval
ggplot(train.store, aes(x = factor(PromoInterval), y = Sales, color = PromoInterval)) +
geom_col() +
ggtitle("Sales by PromoInterval")
# StoreType
ggplot(train.store, aes(x = Date, y = Sales, color = StoreType))+
geom_smooth(se= F, size = 1.5) +
ggtitle("Sales by StoreType")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(train.store, aes(x = Date, y = Customers, color = StoreType)) +
geom_smooth(se= F, size = 1.5) +
ggtitle("Customers by StoreType")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Assortment
ggplot(train.store, aes(x = Date, y = Sales, color = Assortment)) +
geom_smooth(se= F, size = 1.5) +
ggtitle("Sales by Assortment")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(train.store, aes(x = Date, y = Customers, color = Assortment)) +
geom_smooth(se= F, size = 1.5) +
ggtitle("Customers by Assortment")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## 3. CompetitionDistance, OpenSinceMonth/Year
# Combine year and month into one date variable:
store$CompetitionOpenSince <-as.yearmon(paste(store$CompetitionOpenSinceYear,
store$CompetitionOpenSinceMonth, sep = "-"))
# P.S: yearmon functon creates a numeric vector interpreted in "years" and fractions of years. e.g. 1961.5 = June 1961.
# Histogram for CompetitionOpenedSince
plot_ly(x= store$CompetitionOpenSince, type = "histogram") %>%
layout(title = "Distribution of CompetitionOpenedSince",
xaxis = list(title = "Year",
zeroline = FALSE),
yaxis = list(title = "Count",
zeroline = FALSE))
Observations: Many competitors opened recently, except 1 that opened in 1900 and 1 in 1961.
# Combine year and month into one date variable:
store$Promo2Since <- as.POSIXct(paste(store$Promo2SinceYear,
store$Promo2SinceWeek, 1, sep = "-"),
format = "%Y-%U-%u")
hist(as.numeric(as.POSIXct("2015-10-01", format = "%Y-%m-%d") - store$Promo2Since),
100, main = "Days since start of promo2")
# Histogram for Promo2Since (in days)
plot_ly(x= as.POSIXct("2015-10-01", format = "%Y-%m-%d") - store$Promo2Since, type = "histogram") %>%
layout(title = "Distribution of Promo2Since",
xaxis = list(title = "Days",
zeroline = FALSE),
yaxis = list(title = "Count",
zeroline = FALSE))
## Warning: Ignoring 544 observations
# MeanSales by CompetitionDistance
salesbydist <- train.store %>% group_by(CompetitionDistance) %>% summarise(MeanSales = mean(Sales, na.rm=TRUE))
## NOTE: Plotting without mean makes everthing too cluttered. Code below can't see shit. Followed online guide.
## ggplot(train.store, aes(x = CompetitionDistance, y = Sales)) + geom_point() + geom_smooth()
# salesbydist scatterplot
ggplot(salesbydist, aes(x = CompetitionDistance, y = MeanSales)) +
geom_point() + geom_smooth() + scale_x_log10() + scale_y_log10()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Observations: Interestingly, stores with competition that are closer have slightly higher sales on average while those with competition that are further have slightly lower sales. Just based on this graph alone, we cannot deduce much, but a possibility is that the stores with close competitors are situated in areas with high footfall such as cities, contributing to slightly higher revenue.
train.store <- merge(train, store, by = "Store")
train.store2 <- train.store %>% dplyr:: select(
DayOfWeek, #1
Sales, #2
Customers, #3
Open, #4
Promo, #5
StateHoliday, #6
SchoolHoliday, #7
StoreType, #8
Assortment, #9
CompetitionDistance, #10
Promo2, #11
PromoInterval, #12
CompetitionOpenSince) #13
#Promo2Since) #14
str(train.store2)
## 'data.frame': 1017209 obs. of 13 variables:
## $ DayOfWeek : Factor w/ 7 levels "1","2","3","4",..: 5 6 5 3 3 7 3 1 5 1 ...
## $ Sales : int 5263 4952 4190 6454 3310 0 3591 4770 3836 3722 ...
## $ Customers : int 555 646 552 695 464 0 453 542 466 480 ...
## $ Open : Factor w/ 2 levels "0","1": 2 2 2 2 2 1 2 2 2 2 ...
## $ Promo : Factor w/ 2 levels "0","1": 2 1 1 2 1 1 1 2 1 1 ...
## $ StateHoliday : Factor w/ 4 levels "0","a","b","c": 1 1 1 1 1 1 1 1 1 1 ...
## $ SchoolHoliday : Factor w/ 2 levels "0","1": 2 1 2 1 1 1 1 1 1 1 ...
## $ StoreType : Factor w/ 4 levels "a","b","c","d": 3 3 3 3 3 3 3 3 3 3 ...
## $ Assortment : Factor w/ 3 levels "a","b","c": 1 1 1 1 1 1 1 1 1 1 ...
## $ CompetitionDistance : num 1270 1270 1270 1270 1270 1270 1270 1270 1270 1270 ...
## $ Promo2 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ PromoInterval : Factor w/ 4 levels "","Feb,May,Aug,Nov",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ CompetitionOpenSince: 'yearmon' num Sep 2008 Sep 2008 Sep 2008 Sep 2008 ...
# Run lm first
train.mlm <- lm(Sales ~., data = train.store2)
str(train.store)
# Ultimate step-wise regression...is useless in feature selection here...
training.swr <- step(train.mlm, direction = "both")
summary(training.swr)
Observation: All variables are significant with stepwise regression. Proceed to classification trees for prediction
# Decision Tree with Rpart function
train.dt <- rpart(Sales ~., data = train.store2, control = rpart.control(cp = 0.0001))
# Choosing the best cp (complexity parameter)
bestcp <- train.dt$cptable[which.min(train.dt$cptable[,"xerror"]),"CP"]
train.dt.pruned <- prune(train.dt, cp = bestcp)
# Confusion matrix
conf.matrix <- table(train.dt.pruned$Sales, predict(train.dt.pruned, na.action = na.pass))
rownames(conf.matrix) <- paste("Actual", rownames(conf.matrix), sep = ":")
colnames(conf.matrix) <- paste("Pred", colnames(conf.matrix), sep = ":")
print(conf.matrix)
# Use training data to predict and assess performance of model
Train.predict <- predict(train.dt.pruned, train.store2, type = "matrix")
confusionMatrix(table(Train.predict, train.store2$Sales),positive = "1")